A study of Asian Religious and Biblical Texts

This dataset contains the following books:
* Upanishads - ancient Sanskrit texts of spiritual teaching and ideas of Hinduism. They are the part of the Vedas, the oldest scriptures of Hinduism, that deals with meditation, philosophy, and spiritual knowledge; other parts of the Vedas deal with mantras, benedictions, rituals, ceremonies, and sacrifices.

Old Testament :

More on wikipedia

* Sutra - in Indian literary traditions, refers to an aphorism or a collection of aphorisms in the form of a manual or, more broadly, a condensed manual or text. Sutras are a genre of ancient and medieval Indian texts found in Hinduism, Buddhism and Jainism.

EDA

Looking at data

# Show the top-left 10 x 10 corner of `data` as a styled table.
kable_styling(
  knitr::kable(data[1:10, 1:10], caption = 'Dataset', booktabs = TRUE)
)
Dataset
X foolishness hath wholesome takest feelings anger vaivaswata matrix kindled
Buddhism_Ch1 0 0 0 0 0 0 0 0 0
Buddhism_Ch2 0 0 0 0 0 0 0 0 0
Buddhism_Ch3 0 0 0 0 0 0 0 0 0
Buddhism_Ch4 0 0 0 0 0 0 0 0 0
Buddhism_Ch5 0 0 0 0 0 0 0 0 0
Buddhism_Ch6 0 0 0 0 0 0 0 0 0
Buddhism_Ch7 0 0 0 0 0 0 0 0 0
Buddhism_Ch8 0 0 0 0 0 0 0 0 0
Buddhism_Ch9 0 0 0 0 0 0 0 0 0
Buddhism_Ch10 0 0 0 0 0 0 0 0 0

Looking at the data, we concluded that there is no point in keeping chapters separate. We then used the stringi package to extract the names of the books. We decided to combine the biblical texts into one book, as they have significantly fewer chapters than the rest. Then we aggregated the data so that there is only one book per row (word occurrences were summed). We ended up with this data frame:

# Book label for every row: the leading run of letters in its identifier.
book_name <- stri_extract(data$X, regex =  "^[a-zA-Z]+")
# Every label beginning with "Bo" is collapsed into the single label "Bible".
book_name[startsWith(book_name, "Bo")] <- "Bible"

# Attach the label column and drop the first column; the code below treats
# the label as the last column of `data`.
data$book_name <- book_name
data <- data[, -1]
book_names <- unique(data$book_name)

# Column totals per book over every column except the trailing label.
count_cols <- 1:(ncol(data) - 1)
per_book <- lapply(
  book_names,
  function(b) colSums(data[data$book_name == b, count_cols])
)
df <- as.data.frame(do.call(rbind, per_book))

# Put the label first and name the remaining columns after the source columns.
df <- cbind(book_names, df)
colnames(df) <- c("book_name", colnames(data)[count_cols])
m <- ncol(df)

# Show the first 5 rows and 10 columns of the per-book table `df`.
kable_styling(
  knitr::kable(df[1:5, 1:10], caption = 'Dataset', booktabs = TRUE)
)
Dataset
book_name foolishness hath wholesome takest feelings anger vaivaswata matrix kindled
Buddhism 0 0 0 0 19 0 0 0 0
TaoTeChing 0 0 0 0 0 1 0 0 0
Upanishad 0 0 0 0 0 3 1 0 1
YogaSutra 0 2 1 0 0 0 0 1 0
Bible 2 332 3 1 0 31 0 0 3

It is already better for visualization.

Visualization

Most common words per book

# For each book, draw a horizontal bar chart of its ten most frequent words.
for (bn in book_names) {
  # Work on a plain named vector (names are the word columns) instead of
  # sorting a one-row data frame; every column but the first is a count.
  counts <- unlist(df[df$book_name == bn, -1])
  # head() copes with fewer than ten word columns.
  top <- head(sort(counts, decreasing = TRUE), 10)
  # Horizontal bars are drawn bottom-up, so reverse to put the most
  # frequent word at the top.
  barplot(height = rev(top),
          las = 2,
          horiz = TRUE,
          main = paste("Most frequent words in", bn),
          cex.names = 0.7,
          col = "lightblue")
}

A more interesting way to visualize words is with word clouds.

TaoTeChing

bn <- "TaoTeChing"
# Counts of every word column for the selected book, with names stripped.
tmp <- unname(unlist(df[df$book_name == bn, -1]))
df2 <- data.frame(word = colnames(df)[-1], freq = tmp)

# Seed the RNG before drawing the cloud.
set.seed(1234)
wordcloud(
  words = df2$word,
  freq = df2$freq,
  min.freq = 1,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2"),
  main = bn
)

Bible

bn <- "Bible"
# Counts of every word column for the selected book, with names stripped.
tmp <- unname(unlist(df[df$book_name == bn, -1]))
df2 <- data.frame(word = colnames(df)[-1], freq = tmp)

# Seed the RNG before drawing the cloud.
set.seed(1234)
wordcloud(
  words = df2$word,
  freq = df2$freq,
  min.freq = 1,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2"),
  main = bn
)

Buddhism

bn <- "Buddhism"
# Counts of every word column for the selected book, with names stripped.
tmp <- unname(unlist(df[df$book_name == bn, -1]))
df2 <- data.frame(word = colnames(df)[-1], freq = tmp)

# Seed the RNG before drawing the cloud.
set.seed(1234)
wordcloud(
  words = df2$word,
  freq = df2$freq,
  min.freq = 1,
  max.words = 50,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2"),
  main = bn
)

Upanishad

bn <- "Upanishad"
# Counts of every word column for the selected book, with names stripped.
tmp <- unname(unlist(df[df$book_name == bn, -1]))
df2 <- data.frame(word = colnames(df)[-1], freq = tmp)

# Seed the RNG before drawing the cloud.
set.seed(1234)
wordcloud(
  words = df2$word,
  freq = df2$freq,
  min.freq = 1,
  max.words = 80,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2"),
  main = bn
)

YogaSutra

bn <- "YogaSutra"
# Counts of every word column for the selected book, with names stripped.
tmp <- unname(unlist(df[df$book_name == bn, -1]))
df2 <- data.frame(word = colnames(df)[-1], freq = tmp)

# Seed the RNG before drawing the cloud.
set.seed(1234)
wordcloud(
  words = df2$word,
  freq = df2$freq,
  min.freq = 1,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2"),
  main = bn
)

How our chapters, categorized by book, look when embedded with t-SNE

library(Rtsne)
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# 2-D t-SNE embedding of the first 8266 columns of `data`.
# The argument must be spelled `perplexity`; the previous spelling
# (`preplexity`) does not match that parameter, so the value was not applied.
tsne <- Rtsne(
  data[, 1:8266],
  dims = 2,
  perplexity = 30,
  verbose = TRUE,
  max_iter = 500
)
## Performing PCA
## Read the 590 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.08 seconds (sparsity = 0.264752)!
## Learning embedding...
## Iteration 50: error is 60.478213 (50 iterations in 0.22 seconds)
## Iteration 100: error is 60.305248 (50 iterations in 0.27 seconds)
## Iteration 150: error is 59.533257 (50 iterations in 0.19 seconds)
## Iteration 200: error is 59.502624 (50 iterations in 0.18 seconds)
## Iteration 250: error is 59.494416 (50 iterations in 0.08 seconds)
## Iteration 300: error is 1.123140 (50 iterations in 0.07 seconds)
## Iteration 350: error is 1.055831 (50 iterations in 0.06 seconds)
## Iteration 400: error is 1.038577 (50 iterations in 0.06 seconds)
## Iteration 450: error is 1.030946 (50 iterations in 0.06 seconds)
## Iteration 500: error is 1.024539 (50 iterations in 0.06 seconds)
## Fitting performed in 1.26 seconds.
# Convert the embedding in `tsne$Y` to a data frame and tag each row with
# its book label.
data_to_plot <- as.data.frame(tsne$Y)
data_to_plot[["label"]] <- book_name

# Scatter plot of the two embedding columns, coloured by label.
ggplot(
  data = data_to_plot,
  mapping = aes(x = V1, y = V2, color = label)
) +
  geom_point() +
  theme_bw() +
  scale_color_manual(values = brewer.pal(8, "Set1"))

And how they look in 3D

# 3-D t-SNE embedding of the first 8266 columns of `data`.
# The argument must be spelled `perplexity`; the previous spelling
# (`preplexity`) does not match that parameter, so the value was not applied.
tsne <- Rtsne(
  data[, 1:8266],
  dims = 3,
  perplexity = 30,
  verbose = TRUE,
  max_iter = 500
)
## Performing PCA
## Read the 590 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 3, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.10 seconds (sparsity = 0.264752)!
## Learning embedding...
## Iteration 50: error is 59.826617 (50 iterations in 0.41 seconds)
## Iteration 100: error is 59.752607 (50 iterations in 0.41 seconds)
## Iteration 150: error is 59.515524 (50 iterations in 0.23 seconds)
## Iteration 200: error is 59.504687 (50 iterations in 0.23 seconds)
## Iteration 250: error is 59.515569 (50 iterations in 0.22 seconds)
## Iteration 300: error is 0.977989 (50 iterations in 0.17 seconds)
## Iteration 350: error is 0.898545 (50 iterations in 0.16 seconds)
## Iteration 400: error is 0.872014 (50 iterations in 0.17 seconds)
## Iteration 450: error is 0.861557 (50 iterations in 0.17 seconds)
## Iteration 500: error is 0.854472 (50 iterations in 0.17 seconds)
## Fitting performed in 2.37 seconds.
# Convert the embedding in `tsne$Y` to a data frame and tag each row with
# its book label.
data_to_plot <- as.data.frame(tsne$Y)
data_to_plot[["label"]] <- book_name

# Interactive plot over the three embedding columns, coloured by label.
plot_ly(
  data_to_plot,
  x = ~V1,
  y = ~V2,
  z = ~V3,
  color = ~label,
  size = 0.1
)
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

Word lengths

# Build `d`: one row per word occurrence, holding the word's length in
# characters (`word_len`) and the book it was counted in (`book`).

# Length of every word; the words are the column names of `df` after the
# leading `book_name` column.
word_lens <- nchar(colnames(df)[-1])

len_by_book <- lapply(book_names, function(bn) {
  # Occurrence counts of every word for this book, as a plain numeric vector.
  counts <- unlist(df[df$book_name == bn, -1], use.names = FALSE)
  data.frame(
    # Repeat each word's length once per occurrence, in column order.
    word_len = rep(word_lens, times = counts),
    book = rep(bn, sum(counts))
  )
})

# Bind once at the end instead of growing `d` with rbind() inside a loop.
d <- do.call(rbind, len_by_book)

# Density of `word_len`, one translucent filled curve per book.
ggplot(data = d, mapping = aes(x = word_len, fill = book)) +
  geom_density(adjust = 2, alpha = 0.5) +
  theme_minimal()

# Box plot of `word_len` for each book.
ggplot(data = d, mapping = aes(x = book, y = word_len, fill = book)) +
  geom_boxplot() +
  theme_minimal()